rm(list=ls())
library(pheatmap)
library(ggplot2)
library(dichromat)
library(dplyr)
library(ggrepel)
library(reshape2)
library(umap)
library(ggthemes)
library(cowplot)
library(DEP)
library(readr)
library(naniar)
library(SummarizedExperiment)
library(data.table)
library(readxl)
library(writexl)
library(stringr)
library(vsn)
library(rmarkdown)
# library(stringr)
# library(MetaboAnalystR)
# library(matrixStats)
# library(wesanderson)
# library(clusterProfiler)
# library(enrichplot)
# library(msigdbr)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# create directory for results
dir.create(file.path(getwd(),'results'), showWarnings = FALSE)
# create directory for plots
dir.create(file.path(getwd(),'plots'), showWarnings = FALSE)
Loading data including:
The first step is to merge the information of the “All templates” and “Shipment of all boxes” files, and create 3 datasets (one for plasma, one for CSF, one for urine) with the information of Patient ID, Position, Biofluid, Visit, Tube number, Country, Rack and Box.
## sample info - CSF
paged_table(lipidomics_CSF_all)
dim(lipidomics_CSF_all)
## [1] 406 8
lipidomics_CSF_all_summary <- lipidomics_CSF_all %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE),
total_tubes = n_distinct(Tube_number, na.rm = TRUE),
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_CSF_all_summary
## sample info - plasma
paged_table(lipidomics_plasma_all)
dim(lipidomics_plasma_all)
## [1] 431 8
lipidomics_plasma_all_summary <- lipidomics_plasma_all %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE),
total_tubes = n_distinct(Tube_number, na.rm = TRUE),
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_plasma_all_summary
## sample info - urine
paged_table(lipidomics_urine_all)
dim(lipidomics_urine_all)
## [1] 430 8
lipidomics_urine_all_summary <- lipidomics_urine_all %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE),
total_tubes = n_distinct(Tube_number, na.rm = TRUE),
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_urine_all_summary
The second step is to aggregate all the raw data files together and save the batch information, in order to check later if there are any batch effects in the data.
paged_table(raw_data_CSF_final)
dim(raw_data_CSF_final)
## [1] 174 894
# how many rows of QC
length(grep("QC",raw_data_CSF_final$Tube_number))
## [1] 30
paged_table(raw_data_plasma_final)
dim(raw_data_plasma_final)
## [1] 382 894
# how many rows of QC
length(grep("QC",raw_data_plasma_final$Tube_number))
## [1] 66
The third step is to combine the raw data files with the patient information, and check the amount of samples per visit for each fluid.
raw_data_CSF_IDs = raw_data_CSF_final %>%
select(Tube_number,Batch)
raw_data_CSF_IDs <- raw_data_CSF_IDs %>%
mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
filter(!str_detect(Tube_number, "QC")) # remove 30 QC IDs
# how many IDs in the raw data CSF
length(raw_data_CSF_IDs$Tube_number)
## [1] 144
# how many unique IDs in the raw data CSF
length(unique(raw_data_CSF_IDs$Tube_number))
## [1] 144
lipidomics_CSF_all <- lipidomics_CSF_all %>%
mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_CSF = raw_data_CSF_IDs %>%
left_join(lipidomics_CSF_all,by = "Tube_number") %>%
mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_CSF = full_match_CSF %>%
filter(is.na(match_type)) %>%
dplyr::rename(Tube_number4 = Tube_number4.y) %>%
select(-names(lipidomics_CSF_all)) %>%
dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_CSF = no_match_CSF %>%
left_join(lipidomics_CSF_all,by = "Tube_number4") %>%
mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))
# final description of the samples available in the raw data without QC samples
combined_data_CSF = full_match_CSF %>%
filter(!is.na(match_type)) %>%
select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
bind_rows(last4_match_CSF %>%
select(-Tube_number) %>%
left_join(raw_data_CSF_IDs %>% select(Tube_number,Tube_number4)) %>%
select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
distinct()
paged_table(combined_data_CSF)
# how many IDs in the raw data CSF now
length(combined_data_CSF$Tube_number)
## [1] 147
# how many unique IDs in the raw data CSF now
length(unique(combined_data_CSF$Tube_number))
## [1] 144
# how many unique Patient IDs
length(unique(combined_data_CSF$Patient))
## [1] 113
# what is duplicated
combined_data_CSF[duplicated(combined_data_CSF$Tube_number),]
# how many tube numbers and patient IDs per V0 and V1
combined_data_CSF %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# patients at V0 and V1
patients_CSF_V0 <- combined_data_CSF %>%
filter(Visit == "V0") %>%
pull(Patient) %>%
unique()
patients_CSF_V1 <- combined_data_CSF %>%
filter(Visit == "V1") %>%
pull(Patient) %>%
unique() %>%
na.omit()
# check if all patients at V1 are at V0
all(patients_CSF_V1 %in% patients_CSF_V0)
## [1] FALSE
patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0] # patient TR319
## [1] "TR319"
# check patient in the info_patient dataset
lipidomics_CSF_all[lipidomics_CSF_all$Patient %in% patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0],]
raw_data_plasma_IDs = raw_data_plasma_final %>%
select(Tube_number,Batch)
raw_data_plasma_IDs <- raw_data_plasma_IDs %>%
mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
filter(!str_detect(Tube_number, "QC")) # remove 66 QC IDs
# how many IDs in the raw data plasma
length(raw_data_plasma_IDs$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma
length(unique(raw_data_plasma_IDs$Tube_number))
## [1] 316
lipidomics_plasma_all <- lipidomics_plasma_all %>%
mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_plasma = raw_data_plasma_IDs %>%
left_join(lipidomics_plasma_all,by = "Tube_number") %>%
mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_plasma = full_match_plasma %>%
filter(is.na(match_type)) %>%
dplyr::rename(Tube_number4 = Tube_number4.y) %>%
select(-names(lipidomics_plasma_all)) %>%
dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_plasma = no_match_plasma %>%
left_join(lipidomics_plasma_all,by = "Tube_number4") %>%
mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))
# final description of the samples available in the raw data without QC samples
combined_data_plasma = full_match_plasma %>%
filter(!is.na(match_type)) %>%
select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
bind_rows(last4_match_plasma %>%
select(-Tube_number) %>%
left_join(raw_data_plasma_IDs %>% select(Tube_number,Tube_number4)) %>%
select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
distinct()
paged_table(combined_data_plasma)
# how many IDs in the raw data plasma now
length(combined_data_plasma$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma now
length(unique(combined_data_plasma$Tube_number))
## [1] 316
# how many unique Patient IDs
length(unique(combined_data_plasma$Patient))
## [1] 200
# how many tube numbers and patient IDs per V0 and V1
combined_data_plasma %>%
group_by(Visit) %>%
summarise(
total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
total_patients_tubes_pairs = n_distinct(
paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# patients at V0 and V1
patients_plasma_V0 <- combined_data_plasma %>%
filter(Visit == "V0") %>%
pull(Patient) %>%
unique()
patients_plasma_V1 <- combined_data_plasma %>%
filter(Visit == "V1") %>%
pull(Patient) %>%
unique() %>%
na.omit()
# check if all patients at V1 are at V0
all(patients_plasma_V1 %in% patients_plasma_V0)
## [1] FALSE
patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0] # patients TR207, TR116 and TR123
## [1] "TR207" "TR116" "TR123"
# check patient in the info_patient dataset
lipidomics_plasma_all[lipidomics_plasma_all$Patient %in% patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0],]
The fourth step is to remove the QC rows, save the experimental setup (info of patient ID, batch, sample ID, visit, etc.), structure the lipidomics dataset to numeric and create summarised experiments datasets.
The fifth step is to check for missing omics per patient and overall, filter lipids that are not quantified in less than 1/3 of the samples and normalise the data (using vsn).
The sixth step is to create visualizations, such as heatmaps and PCAs (for each fluid with ID/batch info, and with both fluids together)
The seventh step is to perform differential expression analyses, with particular interest of eALS vs CTR, PGMC vs CTR. For this, the information between IDs and the status (eALS, CTR, PGMC, mimic) have to be collected from other datasets